[MachineLICM] Use RegisterClassInfo::getRegPressureSetLimit#119826
[MachineLICM] Use RegisterClassInfo::getRegPressureSetLimit#119826
RegisterClassInfo::getRegPressureSetLimit #119826 · Conversation
|
@llvm/pr-subscribers-backend-loongarch @llvm/pr-subscribers-llvm-transforms Author: Pengcheng Wang (wangpc-pp) Changes
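It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly; MachineLICM now queries `RegisterClassInfo::getRegPressureSetLimit` instead. Separate from #118787. Patch is 5.10 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/119826.diff 50 Files Affected: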
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..798c3461094a8d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -124,6 +124,7 @@ namespace {
const TargetRegisterInfo *TRI = nullptr;
const MachineFrameInfo *MFI = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;
bool PreRegAlloc = false;
bool HasProfileData = false;
@@ -392,6 +393,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
MFI = &MF.getFrameInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
+ RegClassInfo.runOnMachineFunction(MF);
HasProfileData = MF.getFunction().hasProfileData();
@@ -408,7 +410,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
std::fill(RegPressure.begin(), RegPressure.end(), 0);
RegLimit.resize(NumRPS);
for (unsigned i = 0, e = NumRPS; i != e; ++i)
- RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+ RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i);
}
if (HoistConstLoads)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 23f24a9dc9982a..bd2bbb97983122 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, ...
[truncated]
|
|
@llvm/pr-subscribers-backend-amdgpu Author: Pengcheng Wang (wangpc-pp) Changes
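It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly; MachineLICM now queries `RegisterClassInfo::getRegPressureSetLimit` instead. Separate from #118787. Patch is 5.10 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/119826.diff 50 Files Affected: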
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..798c3461094a8d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -124,6 +124,7 @@ namespace {
const TargetRegisterInfo *TRI = nullptr;
const MachineFrameInfo *MFI = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;
bool PreRegAlloc = false;
bool HasProfileData = false;
@@ -392,6 +393,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
MFI = &MF.getFrameInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
+ RegClassInfo.runOnMachineFunction(MF);
HasProfileData = MF.getFunction().hasProfileData();
@@ -408,7 +410,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
std::fill(RegPressure.begin(), RegPressure.end(), 0);
RegLimit.resize(NumRPS);
for (unsigned i = 0, e = NumRPS; i != e; ++i)
- RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+ RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i);
}
if (HoistConstLoads)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 23f24a9dc9982a..bd2bbb97983122 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, ...
[truncated]
|
|
@llvm/pr-subscribers-backend-nvptx Author: Pengcheng Wang (wangpc-pp) Changes
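It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly, just like the comment "This limit must be adjusted dynamically for reserved registers" says. Separate from #118787. Patch is 5.10 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119826.diff 50 Files Affected: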
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..798c3461094a8d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -124,6 +124,7 @@ namespace {
const TargetRegisterInfo *TRI = nullptr;
const MachineFrameInfo *MFI = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;
bool PreRegAlloc = false;
bool HasProfileData = false;
@@ -392,6 +393,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
MFI = &MF.getFrameInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
+ RegClassInfo.runOnMachineFunction(MF);
HasProfileData = MF.getFunction().hasProfileData();
@@ -408,7 +410,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
std::fill(RegPressure.begin(), RegPressure.end(), 0);
RegLimit.resize(NumRPS);
for (unsigned i = 0, e = NumRPS; i != e; ++i)
- RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+ RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i);
}
if (HoistConstLoads)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 23f24a9dc9982a..bd2bbb97983122 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, ...
[truncated]
|
|
Ping. |
1 similar comment
|
Ping. |
`RegisterClassInfo::getRegPressureSetLimit` is a wrapper of `TargetRegisterInfo::getRegPressureSetLimit` with some logic to adjust the limit by removing reserved registers. It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly, just like the comment "This limit must be adjusted dynamically for reserved registers" says. Separate from llvm#118787
922617d to
0cee5a6
Compare
|
I've reverted this in eeac0ff, because it causes a very large compile-time regression, see https://llvm-compile-time-tracker.com/compare.php?from=e3e26dc41a6ad78c35a1a723cd77f5db8599797d&to=b4e17d4a314ed87ff6b40b4b05397d4b25b6636a&stat=instructions:u. If you need a test case for analysis, a good one is probably k.cc from kimwitu++, which regresses by more than 6%. |
Thanks for reporting. I'm not surprised by the compile-time increase. I was going to make |
…llvm#119826)" This reverts commit b4e17d4. This causes a large compile-time regression.
`RegisterClassInfo::getRegPressureSetLimit` is a wrapper of `TargetRegisterInfo::getRegPressureSetLimit` with some logic to adjust the limit by removing reserved registers.
It seems that we shouldn't use
`TargetRegisterInfo::getRegPressureSetLimit` directly, just like the comment "This limit must be adjusted
dynamically for reserved registers" says.
Separate from #118787